library(readxl)
data_train <- read_excel("Data/data_train.xlsx")
#install.packages(c("ggplot2", "sf", "dplyr"))
# Load libraries
library(ggplot2)
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(dplyr)
| Category | Variable | Description |
|---|---|---|
| Continuous | precio.house.m2 | Property sale price in euros per square meter |
| antig | Age of the property (years) | |
| Ruidos_ext | External noise (%) | |
| Mal_olor | Pollution or bad odors (%) | |
| Poca_limp | Lack of street cleanliness (%) | |
| Malas_comunic | Poor communications (%) | |
| Pocas_zonas | Few green spaces (%) | |
| Delincuencia | Crime (%) | |
| CO | Level of CO in the air at the property’s coordinates (standardized values) | |
| NO2 | Level of NO2 in the air at the property’s coordinates (standardized values) | |
| Nox | Level of Nox in the air at the property’s coordinates (standardized values) | |
| O3 | Level of O3 in the air at the property’s coordinates (standardized values) | |
| SO2 | Level of SO2 in the air at the property’s coordinates (standardized values) | |
| PM10 | Level of CO in the air at the property’s coordinates (standardized values) | |
| Pobl.0_14_div_Poblac.Total | Percentage of children between 0 and 14 years in the district | |
| PoblJubilada_div_Poblac.Total | Percentage of retired population in the district | |
| Inmigrantes.porc | Percentage of immigrant population in the district | |
| sup.const | Built area of the property | |
| sup.util | Usable area of the property | |
| Categorical | barrio/cod_barrio | Code or name of the neighborhood in the city of Madrid |
| distrito/cod_distrito | Code or name of the district in the city of Madrid | |
| longitud/latitud | Geographical coordinates of the property (longitude and latitude) | |
| dorm | Number of bedrooms | |
| banos | Number of bathrooms | |
| tipo.casa | Type of property | |
| estado | Condition of the property | |
| Binary | inter.exter | Interior or exterior design of the property |
| ascensor | Elevator availability in the building | |
| comercial | Indicates if the property is located in a commercial area | |
| casco.historico | Indicates if the property is in Madrid’s historic center | |
| M.30 | Indicates if the property is within the M-30 | |
| Identifier | ref.hip.zona | Mortgage reference of the area |
| train_indices | ID of the observation |
colnames(data_train)
## [1] "train_indices" "barrio"
## [3] "cod_barrio" "distrito"
## [5] "cod_distrito" "longitud"
## [7] "latitud" "precio.house.m2"
## [9] "sup.const" "sup.util"
## [11] "ref.hip.zona" "dorm"
## [13] "banos" "tipo.casa"
## [15] "inter.exter" "ascensor"
## [17] "estado" "antig"
## [19] "comercial" "casco.historico"
## [21] "Ruidos_ext" "Mal_olor"
## [23] "Poca_limp" "Malas_comunic"
## [25] "Pocas_zonas" "Delincuencia"
## [27] "M.30" "CO"
## [29] "NO2" "Nox"
## [31] "O3" "SO2"
## [33] "PM10" "Pobl.0_14_div_Poblac.Total"
## [35] "PoblJubilada_div_Poblac.Total" "Inmigrantes.porc"
# turn identifiers into strings to prevent errors
data_train[, c("train_indices", "cod_barrio", "cod_distrito", "ref.hip.zona ", "casco.historico", "M.30","comercial")] <- lapply(data_train[, c("train_indices", "cod_barrio", "cod_distrito", "ref.hip.zona", "casco.historico", "M.30","comercial")], as.character)
unique_counts <- data.frame(
Column = names(data_train),
Unique_Values = sapply(data_train, function(x) length(unique(x)))
)
unique_counts
## Column Unique_Values
## train_indices train_indices 736
## barrio barrio 118
## cod_barrio cod_barrio 116
## distrito distrito 21
## cod_distrito cod_distrito 21
## longitud longitud 712
## latitud latitud 720
## precio.house.m2 precio.house.m2 567
## sup.const sup.const 164
## sup.util sup.util 160
## ref.hip.zona ref.hip.zona 581
## dorm dorm 8
## banos banos 6
## tipo.casa tipo.casa 6
## inter.exter inter.exter 2
## ascensor ascensor 2
## estado estado 7
## antig antig 174
## comercial comercial 2
## casco.historico casco.historico 2
## Ruidos_ext Ruidos_ext 38
## Mal_olor Mal_olor 43
## Poca_limp Poca_limp 45
## Malas_comunic Malas_comunic 48
## Pocas_zonas Pocas_zonas 57
## Delincuencia Delincuencia 55
## M.30 M.30 2
## CO CO 691
## NO2 NO2 412
## Nox Nox 412
## O3 O3 625
## SO2 SO2 736
## PM10 PM10 736
## Pobl.0_14_div_Poblac.Total Pobl.0_14_div_Poblac.Total 19
## PoblJubilada_div_Poblac.Total PoblJubilada_div_Poblac.Total 19
## Inmigrantes.porc Inmigrantes.porc 19
## ref.hip.zona ref.hip.zona 581
plot_hist = function(data_train, type = "both", n, m) {
# Configure subplots
par(mfrow = c(n, m),mar = c(10, 4, 4, 1))
# Loop through columns
for (col in colnames(data_train)) {
# plot numeric columns
if (is.numeric(data_train[[col]]) && (type == "numeric" || type == "both")) {
# Create histograms
hist(data_train[[col]],
main = paste(col),
xlab = "",
col = "dodgerblue4")
# plot non-numeric columns
} else if (!is.numeric(data_train[[col]]) && (type == "categoric"
|| type == "both")) {
# Adjust x-axis label arrangement for specific cols
barplot(table(data_train[[col]]),
main = paste(col),
xlab = "",
col = "dodgerblue",
las = 2)
}
}
# Reset the plotting layout
par(mfrow = c(1, 1))
}
# Numeric hist
plot_hist(data = data_train ,type="numeric", n=3, m=2)
# Numeric hist
plot_hist(data = data_train ,type="categoric", n=3, m=2)
TODO: Conditional histograms for categoricals (barrios)
# Install and load necessary packages
library(reshape2)
library(ggplot2)
library(GGally) # This is required for ggpairs
# Compute correlation matrix
numeric_column_names <- names(data_train)[sapply(data_train, is.numeric)]
corr_mat <- round(cor(data_train[, numeric_column_names]), 2)
melted_corr_mat <- melt(corr_mat)
# Plot correlation heatmap
ggplot(data = melted_corr_mat, aes(x = Var2, y = Var1, fill = value)) +
geom_tile() +
geom_text(aes(label = value), color = "black", size = 4)+
scale_fill_gradient(low = "white", high = "red") +
theme_minimal()
# Pairplot using ggpairs
#ggpairs(data_train[, numeric_column_names],
# aes(color = data_train$group_var))
average_price_per_barrio <- data_train %>%
group_by(barrio) %>%
summarise(average_price = mean(`precio.house.m2`, na.rm = TRUE))
ggplot(average_price_per_barrio, aes(x = reorder(barrio, average_price), y = average_price)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() + # Flip coordinates for better readability if there are many barrios
labs(title = "Average Price per 2m by Barrio",
x = "Barrio",
y = "Average Price per 2m") +
theme_minimal()
average_price_per_district <- data_train %>%
group_by(distrito) %>%
summarise(average_price = mean(`precio.house.m2`, na.rm = TRUE))
ggplot(average_price_per_district, aes(x = reorder(distrito, average_price), y = average_price)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() + # Flip coordinates for better readability if there are many barrios
labs(title = "Average Price per 2m by District",
x = "District",
y = "Average Price per 2m") +
theme_minimal()
points_sf <- st_as_sf(data_train, coords = c("longitud", "latitud"), crs = 4326)
points_sf <- points_sf[order(points_sf$precio.house.m2), ]
# Download the map of Madrid (use a simple shapefile, for example)
madrid_map <- st_read("Data/madrid.geojson")
## Reading layer `madrid' from data source
## `/Users/Simon 1/Documents/GitHub/uc3m_regression_competition/Data/madrid.geojson'
## using driver `GeoJSON'
## Simple feature collection with 128 features and 15 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -3.887647 ymin: 40.31325 xmax: -3.516929 ymax: 40.64444
## Geodetic CRS: WGS 84
# Plot the map with the points colored by the 'value' variable
ggplot() +
geom_sf(data = madrid_map, fill = "lightgray", color = "white") + # Plot map
geom_sf(data = points_sf, aes(color = precio.house.m2), size = 1.5) + # Plot points
scale_color_gradient(low = "#00C5CD", high = "red") +
labs(title = "Spatial situation and price",
x = "Longitude", y = "Latitude") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels
### Done with teh help of ChatGPT